# Goal: Find the best set of questions measuring ideology dimensions
# by Jennifer Pan and Yiqing Xu

library("haven")
library("psych")
library("psy")

# step 1. fill in missing data
# step 2. find the best combinations

rm(list=ls(all=TRUE))
d <- haven::read_dta("data/sample2.dta")
d <- as.data.frame(d[which(d$wave2==1),])
ls()
names(d)
dim(d)

# order: nationalism, political, economic, traditional, equality, ethnicity
QSet <- list(paste0("s1_",1:14),
             paste0("s2_",1:14),
             paste0("s3_",1:14),
             paste0("s4_",1:7),
             paste0("s5_",1:7),
             paste0("s6_",1:7),
             paste0("t1_",1:14),
             paste0("t2_",1:14),
             paste0("t3_",1:14),
             paste0("t4_",1:7),
             paste0("t5_",1:7),
             paste0("t6_",1:7))

QSet1 <- list(paste0("s1_",1:14),
              paste0("s2_",1:14),
              paste0("s3_",1:14),
              paste0("s4_",1:7),
              paste0("s5_",1:7),
              paste0("s6_",1:7))

QSet2 <- list(paste0("t1_",1:14),
              paste0("t2_",1:14),
              paste0("t3_",1:14),
              paste0("t4_",1:7),
              paste0("t5_",1:7),
              paste0("t6_",1:7))

Sign <- list(
  c(1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1), 
  c(-1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1),
  c(-1, 1, -1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1),
  c(1, -1, -1, -1, 1, 1, 1),
  c(-1, 1, -1,  1, 1, 1, -1),
  c(1, -1,  1, -1, -1, 1, -1))

#### Step 1. Fill missing data #####

## multiple imputation
library("Amelia")
set.seed(12324)
nsims <- 100
for (i in 1:length(QSet)) {
  vars <- QSet[[i]]
  mi.out <- amelia(d[,c("id",vars)], m = nsims,  idvars = "id", ords = vars,
                   parallel = "snow", ncpus = 16)
  out <- matrix(0, nrow(d), length(vars))
  for (i in 1:nsims) {
    imp <- mi.out$imputations[[1]]
    out <- out + imp[,-1]
  }
  out <- out/nsims
  d[,vars] <- out
}

save(d, QSet, QSet1, QSet2, Sign, file = "output/sample2_mi.RData")

#### Step 2. Get the best combinations #####

GetScore <- function(data, category, nitems = 5) {
  
  questions1 <- QSet[[category]]
  questions2 <- QSet2[[category]]
  
  ntot <- length(questions1)
  totcomb <- choose(ntot, nitems) # all combinations
  cat(nitems, "items;", totcomb, "combinations\n")
  
  ## data
  data1 <- data[,questions1]
  data2 <- data[,questions2]
  sign0 <- Sign[[category]]
  
  ## Storage
  comb <- combn(ntot, nitems)
  result <- as.data.frame(matrix(NA, totcomb, 5)) # combinations * 4 indicators 
  colnames(result) <- c("id","SumSign","Cronbach","Stab","Pred")
  result[,1] <- 1:nrow(result)
  
  # full measure from wave2
  full2 <- (as.matrix(data2[,questions2])%*%matrix(sign0,ntot,1))/ntot          
  
  ## loop over all combinations  
  for (k in 1:ncol(comb)) {
    
    this.comb <- comb[,k] 
    sign <- sign0[this.comb]
    sum.sign <- abs(sum(sign))
    measure1 <-   c(as.matrix(data1[,this.comb])%*%matrix(sign,nitems,1))/nitems
    measure2 <-   c(as.matrix(data2[,this.comb])%*%matrix(sign,nitems,1))/nitems
    
    ## 1. Cronbach alpha
    if (nitems >1) {
      Cronbach <- psy::cronbach(data1[,this.comb])$alpha
    } else {
      Cronbach <- 0
    } 
    
    ## 2. Stability
    stab <- cor(measure1, measure2, use = "pairwise.complete.obs")
    
    ## 3. Predictability
    pred <- cor(measure1, full2, use = "pairwise.complete.obs")
    
    out <- c(sum.sign, Cronbach, stab, pred)
    result[k,c(2:5)] <- out        
    
  }
  result$Score <- floor((rank(-result[,2]) + rank(-result[,3]) + rank(-result[,3]))/3)
  out <- list(comb = comb, result = result)
  return(out)
}

out1 <- GetScore(d, 1, nitems = 5)
out2 <- GetScore(d, 2, nitems = 5)
out3 <- GetScore(d, 3, nitems = 5)
out4 <- GetScore(d, 4, nitems = 5)
out5 <- GetScore(d, 5, nitems = 5)
out6 <- GetScore(d, 6, nitems = 5)

save(out1, out2, out3, out4, out5, out6, file = "output/s_bestset.RData")
